clear 
clear matrix
set memory 1000m
set more off
cap log close

cd "/Users/..."

global do_file="‎⁨/Users/.../do_file"
global log_file="/Users/.../log_file"
global raw_data="/Users/.../raw_data⁩⁩"
global working_data="/Users/.../working_data"
global results="/Users/.../results"

log using log_file/clean_081519.log, replace

********************************
* First generate trip-level data
********************************

use raw_data/trip_event_all.dta, clear

drop id driver_id session_id tracking_id

gen temp=substr(date,1,4)
destring temp, replace
tab temp,m
drop if temp<2015
drop temp

rename date trip_created_date

order user_id trip_id


gsort user_id trip_id event_start_time event_id -zendrive_score latitude_start longitude_start

bysort user_id trip_id event_start_time event_id: drop if _n>1

gen double event_start_time_new=clock(event_start_time, "YMDhms")
format event_start_time_new %tc

gen double event_end_time_new=clock(event_end_time, "YMDhms")
format event_end_time_new %tc


** compute the time diff between agg acc and hard brakes


* seconds since the last aggressive accelerations
gen temp1=(event_start_time_new-event_start_time_new[_n-1])/1000 if event_type=="HardBrake" & event_type[_n-1]=="AggressiveAcceleration" & user_id==user_id[_n-1] & trip_id==trip_id[_n-1]
gen temp2=(event_start_time_new-event_start_time_new[_n-2])/1000 if event_type=="HardBrake" & event_type[_n-2]=="AggressiveAcceleration" & user_id==user_id[_n-2] & trip_id==trip_id[_n-2] & temp1==.
gen temp3=(event_start_time_new-event_start_time_new[_n-3])/1000 if event_type=="HardBrake" & event_type[_n-3]=="AggressiveAcceleration" & user_id==user_id[_n-3] & trip_id==trip_id[_n-3] & temp1==. & temp2==.
gen temp4=(event_start_time_new-event_start_time_new[_n-4])/1000 if event_type=="HardBrake" & event_type[_n-4]=="AggressiveAcceleration" & user_id==user_id[_n-4] & trip_id==trip_id[_n-4] & temp1==. & temp2==. & temp3==.
gen temp5=(event_start_time_new-event_start_time_new[_n-5])/1000 if event_type=="HardBrake" & event_type[_n-5]=="AggressiveAcceleration" & user_id==user_id[_n-5] & trip_id==trip_id[_n-5] & temp1==. & temp2==. & temp3==. & temp4==.
gen temp6=(event_start_time_new-event_start_time_new[_n-6])/1000 if event_type=="HardBrake" & event_type[_n-6]=="AggressiveAcceleration" & user_id==user_id[_n-6] & trip_id==trip_id[_n-6] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==.
gen temp7=(event_start_time_new-event_start_time_new[_n-7])/1000 if event_type=="HardBrake" & event_type[_n-7]=="AggressiveAcceleration" & user_id==user_id[_n-7] & trip_id==trip_id[_n-7] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==.
gen temp8=(event_start_time_new-event_start_time_new[_n-8])/1000 if event_type=="HardBrake" & event_type[_n-8]=="AggressiveAcceleration" & user_id==user_id[_n-8] & trip_id==trip_id[_n-8] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==.
gen temp9=(event_start_time_new-event_start_time_new[_n-9])/1000 if event_type=="HardBrake" & event_type[_n-9]=="AggressiveAcceleration" & user_id==user_id[_n-9] & trip_id==trip_id[_n-9] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==.
gen temp10=(event_start_time_new-event_start_time_new[_n-10])/1000 if event_type=="HardBrake" & event_type[_n-10]=="AggressiveAcceleration" & user_id==user_id[_n-10] & trip_id==trip_id[_n-10] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==.
gen temp11=(event_start_time_new-event_start_time_new[_n-11])/1000 if event_type=="HardBrake" & event_type[_n-11]=="AggressiveAcceleration" & user_id==user_id[_n-11] & trip_id==trip_id[_n-11] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==.
gen temp12=(event_start_time_new-event_start_time_new[_n-12])/1000 if event_type=="HardBrake" & event_type[_n-12]=="AggressiveAcceleration" & user_id==user_id[_n-12] & trip_id==trip_id[_n-12] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==.
gen temp13=(event_start_time_new-event_start_time_new[_n-13])/1000 if event_type=="HardBrake" & event_type[_n-13]=="AggressiveAcceleration" & user_id==user_id[_n-13] & trip_id==trip_id[_n-13] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==.
gen temp14=(event_start_time_new-event_start_time_new[_n-14])/1000 if event_type=="HardBrake" & event_type[_n-14]=="AggressiveAcceleration" & user_id==user_id[_n-14] & trip_id==trip_id[_n-14] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==.
gen temp15=(event_start_time_new-event_start_time_new[_n-15])/1000 if event_type=="HardBrake" & event_type[_n-15]=="AggressiveAcceleration" & user_id==user_id[_n-15] & trip_id==trip_id[_n-15] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==.
gen temp16=(event_start_time_new-event_start_time_new[_n-16])/1000 if event_type=="HardBrake" & event_type[_n-16]=="AggressiveAcceleration" & user_id==user_id[_n-16] & trip_id==trip_id[_n-16] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==. & temp15==.
gen temp17=(event_start_time_new-event_start_time_new[_n-17])/1000 if event_type=="HardBrake" & event_type[_n-17]=="AggressiveAcceleration" & user_id==user_id[_n-17] & trip_id==trip_id[_n-17] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==. & temp15==. & temp16==.
gen temp18=(event_start_time_new-event_start_time_new[_n-18])/1000 if event_type=="HardBrake" & event_type[_n-18]=="AggressiveAcceleration" & user_id==user_id[_n-18] & trip_id==trip_id[_n-18] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==. & temp15==. & temp16==. & temp17==.
gen temp19=(event_start_time_new-event_start_time_new[_n-19])/1000 if event_type=="HardBrake" & event_type[_n-19]=="AggressiveAcceleration" & user_id==user_id[_n-19] & trip_id==trip_id[_n-19] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==. & temp15==. & temp16==. & temp17==. & temp18==.
gen temp20=(event_start_time_new-event_start_time_new[_n-20])/1000 if event_type=="HardBrake" & event_type[_n-20]=="AggressiveAcceleration" & user_id==user_id[_n-20] & trip_id==trip_id[_n-20] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==. & temp15==. & temp16==. & temp17==. & temp18==. & temp19==.

sum temp20 if temp20~=0

egen temp_sum=rowtotal(temp1-temp20)

gen sec_since_agg=.

forvalues i=1/20{
replace sec_since_agg=temp`i' if temp`i'~=. & sec_since_agg==.
}

drop temp1-temp20


* seconds before the next hard brakes
gen temp1=(event_start_time_new[_n+1]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+1]=="HardBrake" & user_id==user_id[_n+1] & trip_id==trip_id[_n+1]
gen temp2=(event_start_time_new[_n+2]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+2]=="HardBrake" & user_id==user_id[_n+2] & trip_id==trip_id[_n+2] & temp1==.
gen temp3=(event_start_time_new[_n+3]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+3]=="HardBrake" & user_id==user_id[_n+3] & trip_id==trip_id[_n+3] & temp1==. & temp2==.
gen temp4=(event_start_time_new[_n+4]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+4]=="HardBrake" & user_id==user_id[_n+4] & trip_id==trip_id[_n+4] & temp1==. & temp2==. & temp3==.
gen temp5=(event_start_time_new[_n+5]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+5]=="HardBrake" & user_id==user_id[_n+5] & trip_id==trip_id[_n+5] & temp1==. & temp2==. & temp3==. & temp4==.
gen temp6=(event_start_time_new[_n+6]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+6]=="HardBrake" & user_id==user_id[_n+6] & trip_id==trip_id[_n+6] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==.
gen temp7=(event_start_time_new[_n+7]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+7]=="HardBrake" & user_id==user_id[_n+7] & trip_id==trip_id[_n+7] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==.
gen temp8=(event_start_time_new[_n+8]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+8]=="HardBrake" & user_id==user_id[_n+8] & trip_id==trip_id[_n+8] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==.
gen temp9=(event_start_time_new[_n+9]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+9]=="HardBrake" & user_id==user_id[_n+9] & trip_id==trip_id[_n+9] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==.
gen temp10=(event_start_time_new[_n+10]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+10]=="HardBrake" & user_id==user_id[_n+10] & trip_id==trip_id[_n+10] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==.
gen temp11=(event_start_time_new[_n+11]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+11]=="HardBrake" & user_id==user_id[_n+11] & trip_id==trip_id[_n+11] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==.
gen temp12=(event_start_time_new[_n+12]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+12]=="HardBrake" & user_id==user_id[_n+12] & trip_id==trip_id[_n+12] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==.
gen temp13=(event_start_time_new[_n+13]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+13]=="HardBrake" & user_id==user_id[_n+13] & trip_id==trip_id[_n+13] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==.
gen temp14=(event_start_time_new[_n+14]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+14]=="HardBrake" & user_id==user_id[_n+14] & trip_id==trip_id[_n+14] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==.
gen temp15=(event_start_time_new[_n+15]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+15]=="HardBrake" & user_id==user_id[_n+15] & trip_id==trip_id[_n+15] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==.
gen temp16=(event_start_time_new[_n+16]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+16]=="HardBrake" & user_id==user_id[_n+16] & trip_id==trip_id[_n+16] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==. & temp15==.
gen temp17=(event_start_time_new[_n+17]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+17]=="HardBrake" & user_id==user_id[_n+17] & trip_id==trip_id[_n+17] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==. & temp15==. & temp16==.
gen temp18=(event_start_time_new[_n+18]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+18]=="HardBrake" & user_id==user_id[_n+18] & trip_id==trip_id[_n+18] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==. & temp15==. & temp16==. & temp17==.
gen temp19=(event_start_time_new[_n+19]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+19]=="HardBrake" & user_id==user_id[_n+19] & trip_id==trip_id[_n+19] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==. & temp15==. & temp16==. & temp17==. & temp18==.
gen temp20=(event_start_time_new[_n+20]-event_start_time_new)/1000 if event_type=="AggressiveAcceleration" & event_type[_n+20]=="HardBrake" & user_id==user_id[_n+20] & trip_id==trip_id[_n+20] & temp1==. & temp2==. & temp3==. & temp4==. & temp5==. & temp6==. & temp7==. & temp8==. & temp9==. & temp10==. & temp11==. & temp12==. & temp13==. & temp14==. & temp15==. & temp16==. & temp17==. & temp18==. & temp19==.

sum temp20 if temp20~=0

egen temp_sum_new=rowtotal(temp1-temp20)

gen sec_before_hb=.

forvalues i=1/20{
replace sec_before_hb=temp`i' if temp`i'~=. & sec_before_hb==.
}

drop temp1-temp20






gen temp_hard_brake=1 if event_type=="HardBrake"
replace temp_hard_brake=999 if event_type~="HardBrake"


gen temp_hard_turn=1 if event_type=="HardLeftTurn" | event_type=="HardRightTurn" | event_type=="HardUTurn"
replace temp_hard_turn=999 if temp_hard_turn==.

gen temp_brake_turn=1 if temp_hard_brake==1 | temp_hard_turn==1
replace temp_brake_turn=999 if temp_brake_turn==.





** sorting based only on hard brakes
sort user_id trip_id temp_hard_brake event_start_time_new

gen sec_since_last_nm=(event_start_time_new-event_start_time_new[_n-1])/1000 if temp_hard_brake==1 & temp_hard_brake[_n-1]==1 & trip_id==trip_id[_n-1]
sum sec_since_last_nm, d


** sorting based on hard brakes and turns
sort user_id trip_id temp_brake_turn event_start_time_new

gen sec_since_last_hard_turn=(event_start_time_new-event_start_time_new[_n-1])/1000 if temp_hard_brake==1 & temp_hard_turn[_n-1]==1 & trip_id==trip_id[_n-1]  
gen sec_since_next_hard_turn=(event_start_time_new[_n+1]-event_start_time_new)/1000 if temp_hard_brake==1 & temp_hard_turn[_n+1]==1 & trip_id==trip_id[_n+1]  

sum sec_since_last_hard_turn, d
sum sec_since_next_hard_turn, d



**************************************
* Construct measurements for Near-Miss
**************************************

 
*-- hard brakes with 5 min since the last hard brake
gen hard_brake_five_minute=temp_hard_brake
replace hard_brake_five_minute=0 if hard_brake_five_minute==1 & sec_since_last_nm<=300
replace hard_brake_five_minute=. if hard_brake_five_minute==999
bysort user_id trip_id: egen total_hard_brake_1=total(hard_brake_five_minute)
drop hard_brake_five_minute  

*-- hard brakes with 10 min since the last hard brake
gen hard_brake_ten_minute=temp_hard_brake
replace hard_brake_ten_minute=0 if hard_brake_ten_minute==1 & sec_since_last_nm<=600
replace hard_brake_ten_minute=. if hard_brake_ten_minute==999
bysort user_id trip_id: egen total_hard_brake_2=total(hard_brake_ten_minute)
drop hard_brake_ten_minute  


*-- hard brakes with 10 min since the last hard brake
gen hard_brake_fift_minute=temp_hard_brake
replace hard_brake_fift_minute=0 if hard_brake_fift_minute==1 & sec_since_last_nm<=900
replace hard_brake_fift_minute=. if hard_brake_fift_minute==999
bysort user_id trip_id: egen total_hard_brake_3=total(hard_brake_fift_minute)
drop hard_brake_fift_minute  


*-- hard brakes that happen very close to a hard turn: +-150 seconds
gen hard_brake_close_turn=1 if temp_hard_brake==1 & (sec_since_last_hard_turn<=150 | sec_since_next_hard_turn<=150)
bysort user_id trip_id: egen total_hard_brake_close_turn_1=total(hard_brake_close_turn)
drop hard_brake_close_turn


*-- hard brakes that happen very close to a hard turn: +-60 seconds
gen hard_brake_close_turn=1 if temp_hard_brake==1 & (sec_since_last_hard_turn<=60 | sec_since_next_hard_turn<=60)
bysort user_id trip_id: egen total_hard_brake_close_turn_2=total(hard_brake_close_turn)
drop hard_brake_close_turn


*-- hard brakes that happen very close to a hard turn: +-30 seconds
gen hard_brake_close_turn=1 if temp_hard_brake==1 & (sec_since_last_hard_turn<=30 | sec_since_next_hard_turn<=30)
bysort user_id trip_id: egen total_hard_brake_close_turn_3=total(hard_brake_close_turn)
drop hard_brake_close_turn


sort user_id trip_id event_start_time_new
*-- hard brakes when there is no aggressive acceleration in the last 5 mins
sum sec_since_agg, d
gen temp_brake_1=1 if event_type=="HardBrake" & sec_since_agg>300
bysort user_id trip_id: egen total_real_hard_brake_1=total(temp_brake_1)

*-- hard brakes when there is no aggressive acceleration in the last 15 mins
gen temp_brake_2=1 if event_type=="HardBrake" & sec_since_agg>900
bysort user_id trip_id: egen total_real_hard_brake_2=total(temp_brake_2)

drop temp_brake_1 temp_brake_2 temp_sum

*-- hard brakes when there is no aggressive acceleration
gen temp=1 if event_type=="AggressiveAcceleration"
bysort user_id trip_id: egen temp_total=total(temp)
drop temp

gen temp=1 if event_type=="HardBrake"
bysort user_id trip_id: egen total_prev_hard_brake=total(temp) if temp_total==0
replace total_prev_hard_brake=0 if total_prev_hard_brake==.
drop temp temp_total

*-- hard brakes original
gen temp=1 if event_type=="HardBrake"
bysort user_id trip_id: egen total_hard_brake=total(temp)
drop temp


*-- hard left turn
gen temp=1 if event_type=="HardLeftTurn"
bysort user_id trip_id: egen total_hard_left=total(temp)
drop temp

*-- hard right turn
gen temp=1 if event_type=="HardRightTurn"
bysort user_id trip_id: egen total_hard_right=total(temp)
drop temp

*-- hard u turn
gen temp=1 if event_type=="HardUTurn"
bysort user_id trip_id: egen total_hard_u=total(temp)
drop temp

*-- hard turn all
bysort user_id trip_id: gen total_hard_turn=total_hard_left+total_hard_right+total_hard_u



*-- aggressive accelerations when there is no hard brakes
gen temp=1 if event_type=="HardBrake"
bysort user_id trip_id: egen temp_total=total(temp)
drop temp

gen temp=1 if event_type=="AggressiveAcceleration"
bysort user_id trip_id: egen total_agg_acc_alone=total(temp) if temp_total==0
replace total_agg_acc_alone=0 if total_agg_acc_alone==.
drop temp temp_total


*-- aggressive accelerations when there is no hard brakes or hard turns

gen temp=1 if event_type=="HardBrake" | event_type=="HardLeftTurn" | event_type=="HardRightTurn" | event_type=="HardUTurn"
bysort user_id trip_id: egen temp_total=total(temp)
drop temp

gen temp=1 if event_type=="AggressiveAcceleration"
bysort user_id trip_id: egen total_agg_acc_only=total(temp) if temp_total==0
replace total_agg_acc_only=0 if total_agg_acc_only==.
drop temp temp_total

*-- aggressive accelerations when there is no hard brakes in the next 5 mins
sum sec_before_hb, d
gen temp_agg_1=1 if event_type=="AggressiveAcceleration" & sec_before_hb>300
bysort user_id trip_id: egen total_real_agg_acc_1=total(temp_agg_1)


*-- aggressive accelerations when there is no hard brakes in the next 15 mins
gen temp_agg_2=1 if event_type=="AggressiveAcceleration" & sec_before_hb>900
bysort user_id trip_id: egen total_real_agg_acc_2=total(temp_agg_2)

drop temp_agg_1 temp_agg_2 temp_sum_new



drop temp_hard_brake temp_hard_turn temp_brake_turn sec_since_agg sec_before_hb sec_since_last_nm sec_since_last_hard_turn sec_since_next_hard_turn








*******************************************
* Construct measurements for risky behavior
*******************************************


tab event_type,m

gen temp=1 if event_type=="AggressiveAcceleration"
bysort user_id trip_id: egen total_agg_acc=total(temp)
drop temp

gen temp=1 if event_type=="PhoneUse"
bysort user_id trip_id: egen total_phone_use=total(temp)
drop temp

gen temp=1 if event_type=="TrafficJam"
bysort user_id trip_id: egen total_traffic_jam=total(temp)
drop temp

gen temp=1 if event_type=="Accident" | event_type=="NearAccident"
bysort user_id trip_id: egen total_acc=total(temp)
drop temp

gen temp=(event_end_time_new-event_start_time_new)/1000 if event_type=="PhoneUse"
bysort user_id trip_id: egen total_phone_use_sec=total(temp)
drop temp

gen temp=(event_end_time_new-event_start_time_new)/1000 if event_type=="TrafficJam"
bysort user_id trip_id: egen total_traffic_sec=total(temp) 
drop temp




*******************************************
* Generate trip-level dataset
*******************************************


**!! need to sort before dropping multiple events within a trip
sort user_id trip_id event_start_time event_id
bysort user_id trip_id: keep if _n==1




sum total_phone_use_sec, d
sum total_traffic_sec, d

sort user_id trip_id

gen trip_start_hours=substr(start_time, 12, 2)
destring trip_start_hours, replace
tab trip_start_hours,m
gen drive_at_night=1 if trip_start_hours>=21 | trip_start_hours<=6
replace drive_at_night=0 if drive_at_night==.

gen trip_start_date=substr(start_time, 1, 10)
gen temp=date(trip_start_date, "YMD")
gen day_of_week=dow(temp)
drop temp

gen weekend=1 if day_of_week==0 | day_of_week==6
replace weekend=0 if weekend==.

drop fuel_efficiency_score status 
drop event_id event_latitude_start event_longitude_start event_latitude_end event_longitude_end event_start_time event_end_time ///
event_type event_created_at updated_at event_start_time_new event_end_time_new 


** generate driving durations (in hours), drop outliers
gen double start_time_new=clock(start_time, "YMDhms")
format start_time_new %tc
drop start_time

gen double end_time_new=clock(end_time, "YMDhms")
format end_time_new %tc
drop end_time

gen duration=(end_time_new-start_time_new)/1000
gen temp=duration-drive_time_hours
tab temp,m

drop if temp<0
drop temp drive_time_hours

replace duration=duration/3600
gen speed=distance_km/duration
tab duration if speed==.
drop if duration==0

** Now check for outliers at the trip level

list duration distance_km speed if distance_km>4800  & speed>200
geodist latitude_start longitude_start latitude_end longitude_end if distance_km>4800  & speed>200, gen(temp_dist)

gen speed_new=temp_dist/duration
list temp_dist distance_km duration speed speed_new if distance_km>4800  & speed>200

gen orig_dist=distance_km

replace distance_km=temp_dist if orig_dist>4800 & speed>200 & speed_new<200
replace speed=speed_new if orig_dist>4800 & speed>200 & speed_new<200

list distance_km duration speed if duration>24

* change outliers to be missing value
replace duration=. if duration>24
replace distance_km=. if distance_km>4800
replace speed=distance_km/duration
replace speed=. if speed>200

drop temp_dist speed_new orig_dist

rename distance_km distance 

sum control_score, d
replace control_score=. if control_score<0

sum cautious_score, d
replace cautious_score=. if cautious_score<0

sum focused_score, d
replace focused_score=. if focused_score<0

sum zendrive_score, d
rename zendrive_score drive_score
replace drive_score=. if drive_score<0


// fill in the missing values for cautious_score and drive_score
bysort user_id: egen control_score_mean=mean(control_score)
bysort user_id: egen cautious_score_mean=mean(cautious_score)
bysort user_id: egen focused_score_mean=mean(focused_score)
bysort user_id: egen drive_score_mean=mean(drive_score)

replace control_score=control_score_mean if control_score==. & control_score_mean~=.
replace cautious_score=cautious_score_mean if cautious_score==. & cautious_score_mean~=.
replace focused_score=focused_score_mean if focused_score==. & focused_score_mean~=.
replace drive_score=drive_score_mean if drive_score==. & drive_score_mean~=.

replace drive_score=(control_score+cautious_score+focused_score)/3 if drive_score==.

drop control_score_mean cautious_score_mean focused_score_mean drive_score_mean 




* generate rush hour dummy
rename trip_start_hour hour

gen rush_hour=1 if hour==7 | hour==8 | hour==9 | hour==17 | hour==18 | hour==19
replace rush_hour=0 if rush_hour==.

* generate gap_time
sort user_id start_time_new
bysort user_id: gen trip_index=_n

tsset user_id trip_index

gen gap_time=(start_time_new-l.end_time_new)/(1000*3600)

gen temp=gap_time if gap_time<0
bysort user_id trip_start_date: egen temp1=total(temp) 
drop if temp1<0

drop temp temp1 trip_index


drop trip_created_date trip_created_at
order user_id trip_start_date trip_id start_time_new end_time_new  


*save working_data/clean_trip_level_052019.dta, replace


**************************
* Add Geo Info 
**************************

*use working_data/clean_trip_level_052019.dta, clear

tostring longitude_start, gen(long_start) force
tostring longitude_end, gen(long_end) force
tostring latitude_start, gen(lat_start) force
tostring latitude_end, gen(lat_end) force

sum longitude_start, d
sum longitude_end, d
sum latitude_start, d
sum latitude_end, d


replace long_start="." if longitude_start<0
replace long_end="." if longitude_end<0
replace lat_start="." if latitude_start<0
replace lat_end="." if latitude_end<0


gen lat_start_new=substr(lat_start, 1, 3) if latitude_start<10
replace lat_start_new=substr(lat_start, 1, 4) if latitude_start>10 & latitude_start<100
replace lat_start_new=substr(lat_start, 1, 5) if latitude_start>100

gen long_start_new=substr(long_start, 1, 3) if longitude_start<10
replace long_start_new=substr(long_start, 1, 4) if longitude_start>10 & longitude_start<100
replace long_start_new=substr(long_start, 1, 5) if longitude_start>100

gen lat_end_new=substr(lat_end, 1, 3) if latitude_end<10
replace lat_end_new=substr(lat_end, 1, 4) if latitude_end>10 & latitude_end<100
replace lat_end_new=substr(lat_end, 1, 5) if latitude_end>100

gen long_end_new=substr(long_end, 1, 3) if longitude_end<10
replace long_end_new=substr(long_end, 1, 4) if longitude_end>10 & longitude_end<100
replace long_end_new=substr(long_end, 1, 5) if longitude_end>100



gen lat_long_start="("+lat_start_new+","+long_start_new+")"
gen lat_long_end="("+lat_end_new+","+long_end_new+")"

drop long_end lat_end long_start lat_start lat_start_new long_start_new lat_end_new long_end_new


* find trips that does not have gps info, replaced with the driver's most frequent location

bysort user_id: gen temp=1 if lat_long_start=="(.,.)"
bysort user_id: gen temp1=1 if lat_long_start~="(.,.)"
bysort user_id: egen temp_total=total(temp)
bysort user_id: egen temp_total1=total(temp1)
bysort user_id: egen fre_loc=mode(lat_long_start) if temp_total>0 & temp_total1~=0, minmode

replace lat_long_start=fre_loc if lat_long_start=="(.,.)" & fre_loc~="(.,.)" & temp_total>0 & temp_total1~=0

sort user_id temp1 start_time_new
count if fre_loc=="(.,.)"
replace lat_long_start=lat_long_start[_n-1] if user_id==user_id[_n-1] & lat_long_start=="(.,.)" & lat_long_start[_n-1]~="(.,.)"
replace lat_long_start=lat_long_start[_n-1] if user_id==user_id[_n-1] & lat_long_start=="(0,0)" & lat_long_start[_n-1]~="(.,.)" & lat_long_start[_n-1]~="(0,0)"
replace lat_long_start=lat_long_start[_n-1] if user_id==user_id[_n-1] & (latitude_start==. | longitude_start==.) & latitude_start[_n-1]~=. & longitude_start[_n-1]~=.

drop temp temp1 temp_total temp_total1 fre_loc


count if lat_long_start=="(.,.)"
count if lat_long_start=="(0,0)"

* drop individuals who have no geo information at all
drop if lat_long_start=="(.,.)"


* merge addr info
merge m:1 lat_long_start using working_data/clean_addr.dta
drop if _merge==2
drop _merge

tab lat_long_start if city==""
gen city_null=1 if city==""
bysort user_id: egen total_city_null=total(city_null)
bysort user_id: gen total_trip=_N
bysort user_id: gen trip_index=_n
gen temp=1 if total_city_null==total_trip

tab temp if trip_index==1 // these people have no addr info at all
drop if temp==1

drop temp city_null total_city_null 
drop total_trip trip_index

//bysort user_id: egen highway_count=total(highway)
//gen high_way_ratio=highway_count/total_trip
//hist high_way_ratio
//sum high_way_ratio, d


**************************
* Add Weather Info 
**************************

gen date=date(trip_start_date, "YMD")
format date %td

sort city_urls date
merge m:1 city_urls date using working_data/weather_all.dta

drop if _merge==2

tab high_temper if city_urls=="" // if city info is missing, for sure weather info is missing
tab date if _merge==1 & city_urls~="", m  // if city info is not missing, for some dates, still don't have weather info, why??
										  // cannot get weather information for certain dates online
list city_urls date if _merge==1 & city_urls~=""

drop _merge
tab high_temper, m // weather info may be missing for several days, but can still use other days


* clean weather info

tab weather_1

gen sunny=1 if weather_1=="晴"
replace sunny=0 if sunny==.

gen temp=strpos(weather_1, "云")
gen cloudy=1 if temp~=0
replace cloudy=0 if cloudy==.
drop temp 
replace cloudy=1 if weather_1=="阴" | weather_1=="阴天"

gen temp=strpos(weather_1, "暴雨")
gen rain_storm=1 if temp~=0
replace rain_storm=0 if rain_storm==.
drop temp

gen temp=strpos(weather_1, "阵雨")
replace rain_storm=1 if temp~=0
drop temp

gen temp=strpos(weather_1, "雨")
gen rain=1 if temp~=0 & rain_storm==0
replace rain=0 if rain==.
drop temp

gen temp=strpos(weather_1, "雪")
gen snow=1 if temp~=0
replace snow=0 if snow==.
drop temp

gen windy=1 if weather_1=="刮风" | weather_1=="扬沙"
replace windy=0 if windy==.

gen foggy=1 if weather_1=="薄雾" | weather_1=="浮尘" | weather_1=="雾" | weather_1=="霾"
replace foggy=0 if foggy==.

tab weather_1 if sunny==0 & cloudy==0 & rain_storm==0 & rain==0 & snow==0 & windy==0 & foggy==0

tab weather_2 if weather_1=="-"

replace cloudy=1 if weather_1=="-" & weather_2=="多云"
replace cloudy=1 if weather_1=="-" & weather_2=="阴"
replace sunny=1 if weather_1=="-" & weather_2=="晴"
replace rain=1 if weather_1=="-" & weather_2=="小雨"



drop lat_long_start lat_long_end addr_start post_code weather_1 weather_2 wind1 wind2


* adjust missing values for low and high temper using the same day max
bysort user_id trip_start_date: egen high_temper_max=max(high_temper)
bysort user_id trip_start_date: egen low_temper_max=max(low_temper)
replace high_temper=high_temper_max if high_temper==. & high_temper_max~=.
replace low_temper=low_temper_max if low_temper==. & low_temper_max~=.

drop high_temper_max low_temper_max


* adjust missing values for low and high temper using the average value for this individual
bysort user_id: egen high_temper_mean=mean(high_temper)
bysort user_id: egen low_temper_mean=mean(low_temper)
replace high_temper=high_temper_mean if high_temper==. & high_temper_mean~=.
replace low_temper=low_temper_mean if low_temper==. & low_temper_mean~=.

drop high_temper_mean low_temper_mean




save working_data/clean_trip_level_geo_weather_081519.dta, replace
















**************************************
* Second Step: generate day-level data
**************************************

use working_data/clean_trip_level_geo_weather_081519.dta, clear

order user_id trip_start_date

sort user_id trip_start_date start_time_new

* hard brakes
bysort user_id trip_start_date: egen day_total_real_hard_brake_1=total(total_real_hard_brake_1)
bysort user_id trip_start_date: egen day_total_real_hard_brake_2=total(total_real_hard_brake_2)
bysort user_id trip_start_date: egen day_total_prev_hard_brake=total(total_prev_hard_brake)
bysort user_id trip_start_date: egen day_total_hard_brake=total(total_hard_brake)


bysort user_id trip_start_date: egen day_total_hard_brake_1=total(total_hard_brake_1)
bysort user_id trip_start_date: egen day_total_hard_brake_2=total(total_hard_brake_2)
bysort user_id trip_start_date: egen day_total_hard_brake_3=total(total_hard_brake_3)


bysort user_id trip_start_date: egen day_total_hb_close_turn_1=total(total_hard_brake_close_turn_1)
bysort user_id trip_start_date: egen day_total_hb_close_turn_2=total(total_hard_brake_close_turn_2)
bysort user_id trip_start_date: egen day_total_hb_close_turn_3=total(total_hard_brake_close_turn_3)


* hard turns
bysort user_id trip_start_date: egen day_total_hard_left=total(total_hard_left)
bysort user_id trip_start_date: egen day_total_hard_right=total(total_hard_right)
bysort user_id trip_start_date: egen day_total_hard_u=total(total_hard_u)
bysort user_id trip_start_date: egen day_total_hard_turn=total(total_hard_turn)


* risky behaviors
bysort user_id trip_start_date: egen day_total_agg_acc=total(total_agg_acc)
bysort user_id trip_start_date: egen day_total_agg_acc_alone=total(total_agg_acc_alone)
bysort user_id trip_start_date: egen day_total_real_agg_acc_1=total(total_real_agg_acc_1)
bysort user_id trip_start_date: egen day_total_real_agg_acc_2=total(total_real_agg_acc_2)
bysort user_id trip_start_date: egen day_total_agg_acc_only=total(total_agg_acc_only)


bysort user_id trip_start_date: egen day_total_phone_use=total(total_phone_use)
bysort user_id trip_start_date: egen day_total_phone_use_sec=total(total_phone_use_sec)
bysort user_id trip_start_date: egen day_total_traffic_jam=total(total_traffic_jam)
bysort user_id trip_start_date: egen day_total_traffic_sec=total(total_traffic_sec)
bysort user_id trip_start_date: egen day_total_acc=total(total_acc)

* distance, duration, and speed
bysort user_id trip_start_date: egen day_total_distance=total(distance)
bysort user_id trip_start_date: egen day_total_duration=total(duration)
gen day_avg_speed=day_total_distance/day_total_duration

bysort user_id trip_start_date: egen day_avg_speed2=mean(speed)

* drive at night
gen night_duration=duration if drive_at_night==1
bysort user_id trip_start_date: egen day_total_night_duration=total(night_duration)
gen night_duration_prop=day_total_night_duration/day_total_duration
gen day_drive_at_night=1 if day_total_night_duration>0
replace day_drive_at_night=0 if day_total_night_duration==0

* driving scores
bysort user_id trip_start_date: egen day_avg_control_score=mean(control_score)
bysort user_id trip_start_date: egen day_avg_cautious_score=mean(cautious_score)
bysort user_id trip_start_date: egen day_avg_focused_score=mean(focused_score)
bysort user_id trip_start_date: egen day_avg_drive_score=mean(drive_score)





* highway and rush hour
bysort user_id trip_start_date: egen day_total_highway=total(highway)
bysort user_id trip_start_date: egen day_rush_hour=total(rush_hour)


** sort using trip_start_time!!!! make sure the code can be replicated exactly
sort user_id trip_start_date start_time_new
bysort user_id trip_start_date: keep if _n==1 

sum day_total_distance, d
tab day_total_distance if day_total_distance>1000, m

sum day_total_duration, d
tab day_total_duration if day_total_duration>24, m

sum day_avg_speed, d
tab day_avg_speed if day_avg_speed>200, m
replace day_avg_speed=. if day_avg_speed>200


sum day_avg_control_score, d
sum day_avg_cautious_score, d
sum day_avg_focused_score, d
sum day_avg_drive_score, d

* drop trip level variables
drop trip_id start_time end_time longitude* latitude*  start_time_new end_time_new
drop control_score cautious_score focused_score drive_score
drop distance total_real_hard_brake_1 total_real_hard_brake_2 total_prev_hard_brake ///
	 total_hard_brake total_hard_brake_1 total_hard_brake_2 total_hard_brake_3 ///
	 total_hard_brake_close_turn_1 total_hard_brake_close_turn_2 total_hard_brake_close_turn_3 ///
	 total_hard_left total_hard_right total_hard_u total_hard_turn     
drop total_agg_acc total_agg_acc_alone total_real_agg_acc_1 total_real_agg_acc_2 total_agg_acc_only
drop total_phone_use total_traffic_jam total_phone_use_sec total_traffic_sec ///
	 total_acc drive_at_night duration night_duration speed hour rush_hour highway date gap_time


********* Rename Day Level Variables

rename day_total_real_hard_brake_1 total_real_hard_brake_1
rename day_total_real_hard_brake_2 total_real_hard_brake_2
rename day_total_prev_hard_brake total_prev_hard_brake
rename day_total_hard_brake total_hard_brake


rename day_total_hard_brake_1 total_hard_brake_1
rename day_total_hard_brake_2 total_hard_brake_2
rename day_total_hard_brake_3 total_hard_brake_3

rename day_total_hb_close_turn_1 total_hard_brake_close_turn_1
rename day_total_hb_close_turn_2 total_hard_brake_close_turn_2
rename day_total_hb_close_turn_3 total_hard_brake_close_turn_3



rename day_total_hard_left total_hard_left
rename day_total_hard_right total_hard_right
rename day_total_hard_u total_hard_u
rename day_total_hard_turn total_hard_turn

rename day_total_agg_acc total_agg_acc
rename day_total_agg_acc_alone total_agg_acc_alone
rename day_total_real_agg_acc_1 total_real_agg_acc_1 
rename day_total_real_agg_acc_2 total_real_agg_acc_2 
rename day_total_agg_acc_only total_agg_acc_only

rename day_total_phone_use total_phone_use
rename day_total_phone_use_sec total_phone_use_sec
rename day_total_traffic_jam total_traffic_jam
rename day_total_traffic_sec total_traffic_sec
rename day_total_acc total_acc
rename day_total_distance distance
rename day_total_duration duration
rename day_avg_speed speed
rename day_avg_speed2 speed2
rename day_total_night_duration total_night_duration
rename day_drive_at_night drive_at_night
rename day_avg_control_score control_score
rename day_avg_focused_score focused_score
rename day_avg_cautious_score cautious_score
rename day_avg_drive_score drive_score

rename day_total_highway highway
rename day_rush_hour rush_hour

sort user_id trip_start_date
bysort user_id: gen day_index=_n

tsset user_id day_index
gen trip_start_date_new=date(trip_start_date, "YMD")

gen gap_days=trip_start_date_new-l.trip_start_date_new

drop trip_start_date_new day_index
rename gap_days gap_time  



** drop missing values

sum total_hard_brake total_hard_brake_1 total_hard_brake_2 total_hard_brake_3 ///
	total_hard_turn total_hard_left total_hard_right total_hard_u ///
	total_prev_hard_brake total_real_hard_brake_1 total_real_hard_brake_2 
	

sum	control_score cautious_score focused_score ///
	weekend rush_hour total_traffic_jam ///
	high_temper low_temper sunny cloudy rain_storm rain snow windy foggy  

	
sum total_agg_acc total_agg_acc_alone total_real_agg_acc_1 total_real_agg_acc_2 total_agg_acc_only ///
	total_phone_use distance duration speed drive_at_night highway 	
	

drop if high_temper==. | low_temp==. | speed==.
	

save working_data/clean_day_level_geo_weather_081519.dta, replace



